1. Preparing for the analysis

1.1 R version

R.version
##                _                           
## platform       x86_64-apple-darwin20       
## arch           x86_64                      
## os             darwin20                    
## system         x86_64, darwin20            
## status                                     
## major          4                           
## minor          4.0                         
## year           2024                        
## month          04                          
## day            24                          
## svn rev        86474                       
## language       R                           
## version.string R version 4.4.0 (2024-04-24)
## nickname       Puppy Cup

1.2 Packages

library(knitr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at /Users/yxy/Dropbox/Collaborations/3006_iterative
library(broom)
library(broom.mixed)
library(writexl)
library(ggpubr)
library(gghighlight)
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## 
## The following object is masked from 'package:lme4':
## 
##     lmer
## 
## The following object is masked from 'package:stats':
## 
##     step
library(emmeans)
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(sjPlot)
## #refugeeswelcome
library(performance)
library(RColorBrewer)

1.3 Color palette

color_list <- c(
  "#FF9646", # orange for VH
  "#5FB991" # green for VD
  )

2. Experiment 1: acquisition of phonological variation

2.1 Importing the data of Exp1

exp1_data_original <- read_csv(here("new_analysis_202406", "data_for_new_analysis", "cleaned_data", "exp1_new_cleaned_data_20240607.csv"))
## Rows: 9108 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): condition, participant, phase, training_item, training_stem, train...
## dbl  (4): training_loop, training_trial_num, training_rt, testing_trial_num
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
exp1_data_original

2.2 Filtering out participants with poor performances (unseen choices >= 1/3)

exp1_data_good <- exp1_data_original %>% 
  mutate(
    # add a column for each of the three choices: yes ~ 1, no ~ 0
    harmonic_index = case_when(
      response_type == "harmonic" ~ 1,
      TRUE ~ 0 
    ),
    disharmonic_index = case_when(
      response_type == "disharmonic" ~ 1,
      TRUE ~ 0
    ),
    unseen_index = case_when(
      response_type == "unseen" ~ 1,
      TRUE ~ 0
    )
  ) %>% 
  group_by(participant) %>% 
  mutate(unseen_rate = sum(unseen_index) / 42) %>% 
  ungroup() %>% 
  filter(unseen_rate < 1/3)

exp1_data_good

2.3 The participant distribution

2.3.1 Before exclusion

exp1_data_original %>% select(condition, participant) %>% 
  distinct() %>% 
  group_by(condition) %>% 
  count()

2.3.2 After exclusion

exp1_data_good %>% select(condition, participant) %>% 
  distinct() %>% 
  group_by(condition) %>% 
  count()

2.4 Checking the unseen choice trials

2.4.1 Keeping only the unseen choice trials

exp1_unseen_only <- exp1_data_good %>% 
  filter(response_type == "unseen") %>% 
  mutate(
    unseen_dominant_index = case_when(
      condition == "vh" & unseen_type == "harmonic" ~ 1,
      condition == "vd" & unseen_type == "disharmonic" ~ 1,
      TRUE ~ 0
    ))

exp1_unseen_only

2.4.2 Checking the rates of choosing the unseen choices of the dominant pattern

No difference between two conditions; the unseen choice trials can be excluded

exp1_unseen_only %>% 
  group_by(condition) %>% 
  summarize(total = length(unseen_type),
            n_dominant_unseen = sum(unseen_dominant_index),
            dominant_unseen_rate = n_dominant_unseen / total)
t.test(unseen_dominant_index ~ condition, data = exp1_unseen_only) %>% 
  tidy()

2.5 Finalizing the data for analysis

Excluding the unseen choice trial;
Marking the dominant choices.

exp1_data <- exp1_data_good %>% 
  filter(response_type != "unseen") %>% 
  mutate(
    dominant_index = case_when(
      condition == "vh" & response_type == "harmonic" ~ 1,
      condition == "vd" & response_type == "disharmonic" ~ 1,
      TRUE ~ 0
    )
  )

exp1_data

2.6 Running a regression model

2.6.1 Preparing for the model

Setting the reference level: VH

exp1_data$condition <- factor(exp1_data$condition, levels = c("vd", "vh"))

Setting up sum coding scheme

contrasts(exp1_data$condition) <- contr.sum(2)
contrasts(exp1_data$condition)
##    [,1]
## vd    1
## vh   -1

2.6.2 The model

exp1_model <- glmer(data = exp1_data,
                    dominant_index ~ condition + (1 | participant) + (1 | testing_stem),
                    family = "binomial")
summary(exp1_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: dominant_index ~ condition + (1 | participant) + (1 | testing_stem)
##    Data: exp1_data
## 
##      AIC      BIC   logLik deviance df.resid 
##   2840.4   2863.0  -1416.2   2832.4     2123 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.9012 -1.0366  0.6430  0.8287  1.3620 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev.
##  participant  (Intercept) 0.1200   0.3464  
##  testing_stem (Intercept) 0.0524   0.2289  
## Number of obs: 2127, groups:  participant, 56; testing_stem, 42
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.37372    0.07426   5.033 4.84e-07 ***
## condition1  -0.28269    0.06529  -4.330 1.49e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##            (Intr)
## condition1 -0.096

2.6.3 Comparing the two condition against the input

The VH dominant condition

exp1_data_vh <- exp1_data %>% filter(condition == "vh")
t.test(exp1_data_vh$dominant_index, mu = 0.72) %>% tidy()

The VD dominant condition

exp1_data_vd <- exp1_data %>% filter(condition == "vd")
t.test(exp1_data_vd$dominant_index, mu = 0.72) %>% tidy()

2.7 Plotting Exp1

2.7.1 Summary of estimates for plot

exp1_summary <- emmeans(exp1_model, pairwise ~ condition, type = "response")$emmeans %>% 
  tidy(conf.int = TRUE)
exp1_summary

2.7.2 The plot

Setting the level order

exp1_summary$condition <- factor(exp1_summary$condition, levels = c("vh", "vd"))
exp1_plot <- ggplot(
  data = exp1_summary,
  aes(
    x = condition,
    y = prob
    )
  ) +
  
  geom_bar(
    stat = "identity",
    aes(
      fill = condition
    ),
    color = "black",
    width = 0.8,
    alpha = 0.85
  ) +
  
  geom_errorbar(
    aes(
      ymin = conf.low,
      ymax = conf.high
    ),
    width = 0.2,
    alpha = 0.6
  ) +
  
  geom_hline(yintercept = 0.5,
             linetype = "dashed",
             size = 0.75,
             color = "grey70") +
  
  geom_hline(yintercept = 0.72,
             linetype = "dashed",
             size = 0.75,
             color = "grey70") +
  
  geom_signif(
    comparisons = list(c("vh", "vd")),
    y_position = 0.8,
    tip_length = 0.2,
    annotations = "***",
    textsize = 12
    ) +
  
  scale_fill_manual(values = c(color_list[1], color_list[2])) +
  
  scale_x_discrete(limits = c("vh", "vd"),
                   labels = c("Harmony-\ndominant", "Disharmony-\ndominant")) +
  
  scale_y_continuous(
    limits = c(0, 1),
    breaks = seq(0, 1, by = 0.25),
    labels = scales::percent
    ) +
  
  labs(x = "Condition", 
       y = "% Dominant pattern") +
  
  theme_bw(base_size = 25) +
  
  theme(
    legend.position = "none",
    axis.text.x = element_text(colour = "black"),
    axis.text.y = element_text(colour = "black")
    )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
exp1_plot

2.7.3 Saving the plot

# ggsave(plot = exp1_plot,
#        filename = here("new_analysis_202406", "figures", "exp1_plot.pdf"),
#        height = 6,
#        width = 6)
# 
# ggsave(plot = exp1_plot,
#        filename = here("new_analysis_202406", "figures", "exp1_plot.png"),
#        dpi = 300,
#        height = 6,
#        width = 6)

3. Experiment 2: contact of phonological variation

3.1 Importing the data of Exp2

exp2_data_original <- read_csv(here("new_analysis_202406", "data_for_new_analysis", "cleaned_data", "exp2_new_cleaned_data_20240610.csv"))
## Rows: 7866 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): condition, participant, phase, pre_training_item, testing_stem, pa...
## dbl  (4): order, dominant_index, unseen_index, unseen_dominant_index
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
exp2_data_original

3.2 Filtering out participants with poor performances (unseen choices >= 1/3)

The rates in both the pre-contact testing and post-contact testing phases should be lower than 1/3

exp2_data_good <- exp2_data_original %>% 
  mutate(
    pre_testing_unseen_index = case_when(
      phase == "pre_testing" & unseen_index == 1 ~ 1,
      TRUE ~ 0
    ),
    post_testing_unseen_index = case_when(
      phase == "post_testing" & unseen_index == 1 ~ 1,
      TRUE ~ 0
    )
  ) %>% 
  group_by(participant) %>% 
  mutate(
    pre_testing_unseen_rate = sum(pre_testing_unseen_index) / 8,
    post_testing_unseen_rate = sum(post_testing_unseen_index) / 34
  ) %>% 
  ungroup() %>% 
  filter(
    pre_testing_unseen_rate < 1/3 & post_testing_unseen_rate < 1/3
  )

exp2_data_good

3.3 The participant distribution

3.3.1 Before exclusion

exp2_data_original %>% select(condition, participant) %>% 
  distinct() %>% 
  group_by(condition) %>% 
  count()

3.3.2 After exclusion

exp2_data_good %>% select(condition, participant) %>% 
  distinct() %>% 
  group_by(condition) %>% 
  count()

3.4 Checking the unseen choice trials

3.4.1 Keeping only the unseen choice trials

exp2_unseen_only <- exp2_data_good %>% 
  filter(unseen_index == 1)

exp2_unseen_only

3.4.2 Pre-contact testing phase: Checking the rates of choosing the unseen choices of the dominant pattern

No difference between two conditions; the unseen choice trials can be excluded

exp2_unseen_only %>% 
  filter(phase == "pre_testing") %>% 
  group_by(condition) %>% 
  summarize(total = length(unseen_harmony),
            n_dominant_unseen = sum(unseen_dominant_index),
            dominant_unseen_rate = n_dominant_unseen / total)
t.test(unseen_dominant_index ~ condition, data = exp2_unseen_only %>% filter(phase == "pre_testing")) %>% 
  tidy()

3.4.3 Post-contact testing phase: Checking the rates of choosing the unseen choices of the dominant pattern

No difference between two conditions; the unseen choice trials can be excluded

exp2_unseen_only %>% 
  filter(phase == "post_testing") %>% 
  group_by(condition) %>% 
  summarize(total = length(unseen_harmony),
            n_dominant_unseen = sum(unseen_dominant_index),
            dominant_unseen_rate = n_dominant_unseen / total)
t.test(unseen_dominant_index ~ condition, data = exp2_unseen_only %>% filter(phase == "post_testing")) %>% 
  tidy()

3.5 Finalizing the data for analysis

3.5.1 The data for the contact phase

exp2_data_contact <- exp2_data_good %>% 
  filter(phase == "contact")
exp2_data_contact

3.5.2 The data for the testing phases

Excluding the unseen choice trial

exp2_data_testing <- exp2_data_good %>% 
  filter(phase != "pre_training") %>% 
  filter(phase != "contact") %>% 
  filter(unseen_index == 0)
exp2_data_testing

3.6 Analysing the contact phase in Exp2

3.6.1 Preparing for the model

Setting the reference level: VH

exp2_data_contact$condition <- factor(exp2_data_contact$condition, levels = c("vd", "vh"))

Setting up sum coding scheme

contrasts(exp2_data_contact$condition) <- contr.sum(2)
contrasts(exp2_data_contact$condition)
##    [,1]
## vd    1
## vh   -1

3.6.2 The model

exp2_model_contact <- glmer(data = exp2_data_contact,
                            dominant_index ~ order * condition 
                            + (1 | participant) + (0 + order | participant) + (1 | testing_stem),
                            family = "binomial")
summary(exp2_model_contact)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: dominant_index ~ order * condition + (1 | participant) + (0 +  
##     order | participant) + (1 | testing_stem)
##    Data: exp2_data_contact
## 
##      AIC      BIC   logLik deviance df.resid 
##   2920.4   2960.3  -1453.2   2906.4     2201 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.6341 -1.0796  0.6014  0.8551  1.1590 
## 
## Random effects:
##  Groups        Name        Variance  Std.Dev.
##  participant   (Intercept) 0.0053188 0.07293 
##  participant.1 order       0.0001988 0.01410 
##  testing_stem  (Intercept) 0.0384349 0.19605 
## Number of obs: 2208, groups:  participant, 46; testing_stem, 16
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       0.066390   0.102054   0.651    0.515    
## order             0.015239   0.003893   3.915 9.04e-05 ***
## condition1       -0.012193   0.089312  -0.137    0.891    
## order:condition1 -0.006116   0.003869  -1.581    0.114    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) order  cndtn1
## order       -0.634              
## condition1  -0.017  0.025       
## ordr:cndtn1  0.022 -0.041 -0.722

3.7 Plotting the contact phase in Exp2

3.7.1 The plot

exp2_data_contact_for_plot <- exp2_data_contact %>% 
  mutate(condition = case_when(
    condition == "vh" ~ "Harmony",
    condition == "vd" ~ "Disharmony"
  ))
exp2_data_contact_for_plot$condition <- factor(exp2_data_contact_for_plot$condition, levels = c("Harmony", "Disharmony"))
exp2_model_contact_for_plot <- glmer(data = exp2_data_contact_for_plot,
                                     dominant_index ~ order * condition 
                                     + (1 | participant) + (0 + order | participant) + (1 | testing_stem),
                                     family = "binomial")
exp2_plot_contact <- plot_model(exp2_model_contact_for_plot,
                                type = "pred",
                                terms = c("order [all]", "condition"),
                                title = "",
                                line.size = 2.5) +
  
  geom_hline(yintercept = 0.5,
             linetype = "dashed",
             size = 0.5,
             color = "grey70") +
  
  # the line colors
  scale_color_manual(values = c(color_list[1], color_list[2])) +
  
  # the error shade colors
  scale_fill_manual(values = c(color_list[1], color_list[2])) +
  
  scale_x_continuous(
    limits = c(1, 48),
    breaks = seq(0, 48, by = 8)
  ) +
  
  labs(x = "Trial during contact",
       y = "% Contact pattern",
       color = "Contact\nlanguage") +
  
  theme_bw(base_size = 25) +
  
  theme(
    axis.text.x = element_text(color = "black"),
    axis.text.y = element_text(color = "black"),
    legend.key.spacing.y = unit(0.5, "lines"),
    legend.box.spacing = margin(0.5)
  )
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
exp2_plot_contact

3.7.2 Saving the plot

# ggsave(plot = exp2_plot_contact,
#        filename = here("new_analysis_202406", "figures", "exp2_plot_contact.pdf"),
#        height = 6,
#        width = 8)
# 
# ggsave(plot = exp2_plot_contact,
#        filename = here("new_analysis_202406", "figures", "exp2_plot_contact.png"),
#        dpi = 300,
#        height = 6,
#        width = 8)

3.8 Analysing the contact phase in Exp2

3.8.1 Preparing for the model

Setting the reference level: VH

exp2_data_testing$condition <- factor(exp2_data_testing$condition, levels = c("vd", "vh"))
exp2_data_testing$phase <- factor(exp2_data_testing$phase, levels = c("post_testing", "pre_testing"))

Setting up sum coding scheme

contrasts(exp2_data_testing$condition) <- contr.sum(2)
contrasts(exp2_data_testing$condition)
##    [,1]
## vd    1
## vh   -1
contrasts(exp2_data_testing$phase) <- contr.sum(2)
contrasts(exp2_data_testing$phase)
##              [,1]
## post_testing    1
## pre_testing    -1

3.8.2 The model

exp2_model_testing <- glmer(data = exp2_data_testing,
                            dominant_index ~ condition * phase 
                            + (1 + phase | participant) + (1 | testing_stem),
                            family = "binomial")
summary(exp2_model_testing)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: dominant_index ~ condition * phase + (1 + phase | participant) +  
##     (1 | testing_stem)
##    Data: exp2_data_testing
## 
##      AIC      BIC   logLik deviance df.resid 
##   2538.4   2582.6  -1261.2   2522.4     1847 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.5605 -1.0462  0.7145  0.8907  1.4888 
## 
## Random effects:
##  Groups       Name        Variance Std.Dev. Corr
##  participant  (Intercept) 0.07346  0.2710       
##               phase1      0.03245  0.1801   0.59
##  testing_stem (Intercept) 0.03533  0.1880       
## Number of obs: 1855, groups:  participant, 46; testing_stem, 42
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)        0.14899    0.08071   1.846   0.0649 .
## condition1        -0.05537    0.07176  -0.772   0.4403  
## phase1             0.10873    0.07497   1.450   0.1470  
## condition1:phase1 -0.09505    0.06525  -1.457   0.1452  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndtn1 phase1
## condition1   0.001              
## phase1      -0.385 -0.009       
## cndtn1:phs1 -0.009 -0.317  0.001

3.8.3 Running the post hoc analysis

exp2_posthoc_testing <- emmeans(exp2_model_testing, pairwise ~ condition * phase, adjust = "mvt",
                                type = "response") %>% 
  summary(infer = c(T, T))
exp2_posthoc_testing
## $emmeans
##  condition phase         prob     SE  df asymp.LCL asymp.UCL null z.ratio
##  vd        post_testing 0.527 0.0292 Inf     0.469     0.583  0.5   0.915
##  vh        post_testing 0.601 0.0285 Inf     0.544     0.655  0.5   3.439
##  vd        pre_testing  0.520 0.0428 Inf     0.436     0.603  0.5   0.466
##  vh        pre_testing  0.500 0.0425 Inf     0.418     0.583  0.5   0.003
##  p.value
##   0.3600
##   0.0006
##   0.6410
##   0.9973
## 
## Confidence level used: 0.95 
## Intervals are back-transformed from the logit scale 
## Tests are performed on the logit scale 
## 
## $contrasts
##  contrast                          odds.ratio    SE  df asymp.LCL asymp.UCL
##  vd post_testing / vh post_testing       0.74 0.119 Inf     0.490      1.12
##  vd post_testing / vd pre_testing        1.03 0.204 Inf     0.617      1.71
##  vd post_testing / vh pre_testing        1.11 0.230 Inf     0.655      1.89
##  vh post_testing / vd pre_testing        1.39 0.289 Inf     0.813      2.37
##  vh post_testing / vh pre_testing        1.50 0.299 Inf     0.903      2.50
##  vd pre_testing / vh pre_testing         1.08 0.241 Inf     0.612      1.92
##  null z.ratio p.value
##     1  -1.875  0.2357
##     1   0.138  0.9991
##     1   0.516  0.9545
##     1   1.574  0.3895
##     1   2.051  0.1669
##     1   0.357  0.9842
## 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 6 estimates 
## Intervals are back-transformed from the log odds ratio scale 
## P value adjustment: mvt method for 6 tests 
## Tests are performed on the log odds ratio scale

3.9 Plotting the testing phases in Exp2

3.9.1 Summary of estimates for plot

exp2_summary_testing <- emmeans(exp2_model_testing, pairwise ~ condition * phase,
                                type = "response")$emmeans %>%
  tidy(conf.int = TRUE)

exp2_summary_testing

3.9.2 The plot

Setting the level order

exp2_summary_testing$condition <- factor(exp2_summary_testing$condition, levels = c("vh", "vd"))
exp2_summary_testing$phase <- factor(exp2_summary_testing$phase, levels = c("pre_testing", "post_testing"))
exp2_plot_testing <- ggplot(
  data = exp2_summary_testing,
  aes(
    x = phase,
    y = prob,
    fill = condition
  )
) +
  
  geom_bar(stat = "identity",
           color = "black",
           position = "dodge",
           width = 0.8,
           alpha = 0.85) +
  
  geom_errorbar(aes(ymin = conf.low,
                    ymax = conf.high),
                position = position_dodge(0.8),
                color = "black",
                width = 0.2
                ) +
  
  geom_hline(yintercept = 0.5,
             linetype = "dashed",
             size = 0.75,
             color = "grey70") +
  
  annotate("text",
           x = 1.8,
           y = 0.72,
           label = "***",
           size = 12) +
  
  scale_fill_manual(values = c(color_list[1], color_list[2]),
                    labels = c("Harmony", "Disharmony")) +
  
  scale_x_discrete(limits = c("pre_testing", "post_testing"),
                   labels = c("Pre-contact", "Post-contact")) +
  
  scale_y_continuous(
    limits = c(0, 1),
    breaks = seq(0, 1, by = 0.25),
    labels = scales::percent
    ) +
  
  labs(x = "Testing phase", 
       y = "% Contact pattern",
       fill = "Contact\nlanguage") +
  
  theme_bw(base_size = 25) +
  
  theme(
    axis.text.x = element_text(colour = "black"),
    axis.text.y = element_text(colour = "black"),
    legend.key.spacing.y = unit(0.5, "lines"),
    legend.box.spacing = margin(0.5)
    )

exp2_plot_testing

### 3.9.3 Saving the plot

# ggsave(plot = exp2_plot_testing,
#        filename = here("new_analysis_202406", "figures", "exp2_plot_testing.pdf"),
#        height = 6,
#        width = 8.5)
# 
# ggsave(plot = exp2_plot_testing,
#        filename = here("new_analysis_202406", "figures", "exp2_plot_testing.png"),
#        dpi = 300,
#        height = 6,
#        width = 8.5)

4. Experiment 3: transmission of phonological variation

4.1 Importing the data of Exp1

exp3_data_original <- read_csv(here("new_analysis_202406", "data_for_new_analysis", "cleaned_data", "exp3_new_cleaned_data_20240608.csv"))
## Rows: 4416 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (19): condition, chain, generation, participant, participant_id, phase, ...
## dbl  (7): training_round, training_trial_in_round, training_trial_general, t...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
exp3_data_original

4.2 The participant distribution

Two conditions; Four chains per conditions; four generations;
A total of 32 participants.

exp3_data_original %>% select(condition, chain, generation, participant_id) %>% 
  distinct()

4.3 Checking the unseen choice trials

4.3.1 Keeping only the unseen response trials

exp3_unseen_only <- exp3_data_original %>% 
  filter(response_type == "unseen") %>% 
  mutate(
    unseen_dominant_index = case_when(
      condition == "vh" & unseen_type == "harmonic" ~ 1,
      condition == "vd" & unseen_type == "disharmonic" ~ 1,
      TRUE ~ 0
    ))

exp3_unseen_only

4.3.2 Checking the rates of choosing the unseen choices of the dominant pattern

No difference between two conditions

exp3_unseen_only %>% 
  group_by(condition) %>% 
  summarize(total = length(unseen_type),
            n_dominant_unseen = sum(unseen_dominant_index),
            dominant_unseen_rate = n_dominant_unseen / total)
t.test(unseen_dominant_index ~ condition, data = exp3_unseen_only) %>% 
  tidy()

4.4 Finalizing the data for analysis

exp3_data <- exp3_data_original %>% 
  filter(phase == "testing") %>% 
  mutate(
    dominant_index = case_when(
      condition == "vh" & response_type == "harmonic" ~ 1,
      condition == "vd" & response_type == "disharmonic" ~ 1,
      TRUE ~ 0
    )
  )

exp3_data 

4.5 Running a regression model

4.5.1 Preparing for the model

Setting up the reference levels: VH for condition; gen1 for generation

exp3_data$condition <- factor(exp3_data$condition, levels = c("vd", "vh"))
exp3_data$generation <- factor(exp3_data$generation, levels = c("gen2", "gen3", "gen4", "gen1"))

Setting up sum coding scheme

contrasts(exp3_data$condition) <- contr.sum(2)
contrasts(exp3_data$generation) <- contr.sum(4)

contrasts(exp3_data$condition)
##    [,1]
## vd    1
## vh   -1
contrasts(exp3_data$generation)
##      [,1] [,2] [,3]
## gen2    1    0    0
## gen3    0    1    0
## gen4    0    0    1
## gen1   -1   -1   -1

4.5.2 The model

exp3_model <- glmer(data = exp3_data,
                    dominant_index ~ condition * generation + (1 | participant_id),
                    family = binomial)
summary(exp3_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: dominant_index ~ condition * generation + (1 | participant_id)
##    Data: exp3_data
## 
##      AIC      BIC   logLik deviance df.resid 
##   1849.2   1896.0   -915.6   1831.2     1335 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.3756 -0.9032 -0.7537  1.0853  1.4271 
## 
## Random effects:
##  Groups         Name        Variance Std.Dev.
##  participant_id (Intercept) 0.04027  0.2007  
## Number of obs: 1344, groups:  participant_id, 32
## 
## Fixed effects:
##                        Estimate Std. Error z value Pr(>|z|)  
## (Intercept)            -0.14259    0.06584  -2.166   0.0303 *
## condition1             -0.14356    0.06585  -2.180   0.0292 *
## generation1            -0.28544    0.11469  -2.489   0.0128 *
## generation2            -0.05012    0.11362  -0.441   0.6592  
## generation3             0.06958    0.11371   0.612   0.5406  
## condition1:generation1  0.05484    0.11466   0.478   0.6325  
## condition1:generation2  0.14339    0.11363   1.262   0.2070  
## condition1:generation3 -0.04982    0.11371  -0.438   0.6613  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) cndtn1 gnrtn1 gnrtn2 gnrtn3 cnd1:1 cnd1:2
## condition1   0.002                                          
## generation1  0.010  0.008                                   
## generation2 -0.006 -0.001 -0.335                            
## generation3 -0.005  0.002 -0.335 -0.329                     
## cndtn1:gnr1  0.007  0.010  0.010 -0.004 -0.006              
## cndtn1:gnr2 -0.001 -0.006 -0.005  0.001 -0.001 -0.335       
## cndtn1:gnr3  0.002 -0.005 -0.006 -0.001  0.004 -0.335 -0.329

4.5.3 Running the post hoc analysis

exp3_posthoc <- emmeans(exp3_model, pairwise ~ generation | condition, adjust = "mvt", 
                        type = "response") %>% 
  summary(infer = c(T, T))
exp3_posthoc
## $emmeans
## condition = vd:
##  generation  prob     SE  df asymp.LCL asymp.UCL null z.ratio p.value
##  gen2       0.374 0.0443 Inf     0.292     0.464  0.5  -2.732  0.0063
##  gen3       0.452 0.0459 Inf     0.364     0.542  0.5  -1.041  0.2978
##  gen4       0.434 0.0457 Inf     0.347     0.524  0.5  -1.433  0.1519
##  gen1       0.458 0.0460 Inf     0.370     0.548  0.5  -0.911  0.3625
## 
## condition = vh:
##  generation  prob     SE  df asymp.LCL asymp.UCL null z.ratio p.value
##  gen2       0.416 0.0453 Inf     0.331     0.507  0.5  -1.820  0.0688
##  gen3       0.452 0.0459 Inf     0.365     0.542  0.5  -1.040  0.2984
##  gen4       0.530 0.0461 Inf     0.440     0.618  0.5   0.651  0.5152
##  gen1       0.602 0.0449 Inf     0.512     0.686  0.5   2.214  0.0268
## 
## Confidence level used: 0.95 
## Intervals are back-transformed from the logit scale 
## Tests are performed on the logit scale 
## 
## $contrasts
## condition = vd:
##  contrast    odds.ratio    SE  df asymp.LCL asymp.UCL null z.ratio p.value
##  gen2 / gen3      0.723 0.192 Inf     0.366     1.428    1  -1.223  0.6119
##  gen2 / gen4      0.779 0.206 Inf     0.394     1.539    1  -0.944  0.7810
##  gen2 / gen1      0.706 0.187 Inf     0.358     1.394    1  -1.316  0.5529
##  gen3 / gen4      1.076 0.283 Inf     0.548     2.113    1   0.280  0.9923
##  gen3 / gen1      0.976 0.256 Inf     0.498     1.913    1  -0.093  0.9997
##  gen4 / gen1      0.907 0.238 Inf     0.462     1.780    1  -0.373  0.9823
## 
## condition = vh:
##  contrast    odds.ratio    SE  df asymp.LCL asymp.UCL null z.ratio p.value
##  gen2 / gen3      0.863 0.227 Inf     0.439     1.697    1  -0.559  0.9442
##  gen2 / gen4      0.631 0.166 Inf     0.321     1.241    1  -1.750  0.2977
##  gen2 / gen1      0.470 0.124 Inf     0.238     0.928    1  -2.853  0.0228
##  gen3 / gen4      0.731 0.191 Inf     0.373     1.433    1  -1.196  0.6296
##  gen3 / gen1      0.545 0.144 Inf     0.276     1.072    1  -2.306  0.0966
##  gen4 / gen1      0.745 0.196 Inf     0.378     1.466    1  -1.120  0.6773
## 
## Confidence level used: 0.95 
## Conf-level adjustment: mvt method for 6 estimates 
## Intervals are back-transformed from the log odds ratio scale 
## P value adjustment: mvt method for 6 tests 
## Tests are performed on the log odds ratio scale

4.5.4 Comparing generation 1 against the initial input

The VH dominant condition

exp3_data_gen1_vh <- exp3_data %>% filter(generation == "gen1" & condition == "vh")
t.test(exp3_data_gen1_vh$dominant_index, mu = 0.72) %>% tidy()

The VD dominant condition

exp3_data_gen1_vd <- exp3_data %>% filter(generation == "gen1" & condition == "vd")
t.test(exp3_data_gen1_vd$dominant_index, mu = 0.72) %>% tidy()

4.6 Plotting Exp3

4.6.1 Summary by each individual chain

From gen1 to gen4

exp3_summary_by_chain_gen1_to_gen4 <-
  exp3_data %>% 
  group_by(generation, chain, condition, participant_id) %>% 
  summarize(
    dominant_count = sum(dominant_index),
    dominant_rate = dominant_count / 42
    )
## `summarise()` has grouped output by 'generation', 'chain', 'condition'. You can
## override using the `.groups` argument.
exp3_summary_by_chain_gen1_to_gen4

Adding the initial input

exp3_summary_by_chain_initial <- exp3_summary_by_chain_gen1_to_gen4 %>% 
  filter(generation == "gen1") %>% 
  mutate(generation = "gen0",
         dominant_count = 23,
         dominant_rate = 0.72,
         participant_id = as.character(str_glue("{generation}_{chain}")))

exp3_summary_by_chain_initial

Combining the initial and subsequent generations

exp3_summary_by_chain <- rbind(exp3_summary_by_chain_initial, exp3_summary_by_chain_gen1_to_gen4)
exp3_summary_by_chain

4.6.2 Summary of estimates

From gen1 to gen4

exp3_summary_est_gen1_to_gen4 <- emmeans(exp3_model, pairwise ~ generation | condition, adjust = "mvt", type = "response")$emmeans %>% 
  tidy(conf.int = TRUE)

exp3_summary_est_gen1_to_gen4

Adding the initial input

exp3_summary_est_initial <- exp3_summary_est_gen1_to_gen4 %>% 
  filter(generation == "gen1") %>% 
  mutate(generation = "gen0",
         prob = 0.72,
         std.error = NA,
         conf.low = 0,
         conf.high = 0,
         statistic = NA,
         p.value = NA)

exp3_summary_est_initial

Combining the initial and subsequent generations

exp3_summary_est <- rbind(exp3_summary_est_gen1_to_gen4, exp3_summary_est_initial) %>% 
  rename(dominant_rate = prob)
exp3_summary_est

4.6.3 The plot

Setting the level order

exp3_summary_by_chain$condition <- factor(exp3_summary_by_chain$condition, levels = c("vh", "vd"))
exp3_summary_by_chain$generation <- factor(exp3_summary_by_chain$generation, levels = c("gen0", "gen1", "gen2", "gen3", "gen4"))
exp3_summary_est$condition <- factor(exp3_summary_est$condition, levels = c("vh", "vd"))
exp3_summary_est$generation <- factor(exp3_summary_est$generation, levels = c("gen0", "gen1", "gen2", "gen3", "gen4"))

Preparing the significance bars

exp3_signif <- data.frame(
  condition = c("vh", "vh", "vd"),
  start = c("gen0", "gen1", "gen0"),
  end = c("gen1", "gen2", "gen1"),
  y = c(0.82, 0.75, 0.8),
  label = c("**", "*", "***")
)

exp3_signif$condition <- factor(exp3_signif$condition, levels = c("vh", "vd"))
exp3_plot <- ggplot() +

  # individual chains
  geom_point(
    data = exp3_summary_by_chain,
    aes(
      x = generation,
      y = dominant_rate,
      group = chain
    ),
    color = "grey80"
  ) +

  geom_line(
    data = exp3_summary_by_chain,
    aes(
      x = generation,
      y = dominant_rate,
      group = chain
    ),
    color = "grey80"
  ) +

  # the estimates
  geom_errorbar(
    data = exp3_summary_est,
    aes(
      x = generation,
      ymax = conf.high,
      ymin = conf.low,
      group = condition
    ),
    width = 0.5
  ) + 
  
  geom_point(
    data = exp3_summary_est,
    aes(
      x = generation,
      y = dominant_rate,
      group = condition,
      color = condition
    ),
    size = 4
  ) +
  
  geom_line(
    data = exp3_summary_est,
    aes(
      x = generation,
      y = dominant_rate,
      group = condition,
      color = condition
    ),
    linewidth = 2.5
  ) +  
  
  scale_color_manual(values = c(color_list[1], color_list[2])) +
  
  scale_x_discrete(limits = c("gen0", "gen1", "gen2", "gen3", "gen4"),
                   labels = c("Initial", "1", "2", "3", "4")) +
  
  scale_y_continuous(
    limits = c(0.2, 0.9),
    breaks = seq(0.2, 0.9, by = 0.1),
    labels = scales::percent) +
  
  labs(x = "Generation", 
       y = "% Dominant pattern") +
  
  geom_hline(yintercept = 0.5,
             linetype = "dashed",
             size = 0.75,
             color = "grey70") +
  
  geom_hline(yintercept = 0.72,
             linetype = "dashed",
             size = 0.75,
             color = "grey70") +
  
  geom_signif(
    data = exp3_signif,
    aes(
      xmin = start,
      xmax = end,
      annotations = label,
      y_position = y
    ),
    textsize = 12,
    manual = TRUE
  ) +
  
  facet_grid(. ~ condition,
             labeller = labeller(
               condition = c(vh = "Harmony-dominant", vd = "Disharmony-dominant")
               )
             ) +
  
  theme_bw(base_size = 25) +
  
  theme(
    legend.position = "none",
    axis.text.x = element_text(colour = "black"),
    axis.text.y = element_text(colour = "black")
  )
## Warning in geom_signif(data = exp3_signif, aes(xmin = start, xmax = end, :
## Ignoring unknown aesthetics: xmin, xmax, annotations, and y_position
exp3_plot

4.6.4 Saving the plot

# ggsave(plot = exp3_plot,
#        filename = here("new_analysis_202406", "figures", "exp3_plot.pdf"),
#        height = 6,
#        width = 10)
# 
# ggsave(plot = exp3_plot,
#        filename = here("new_analysis_202406", "figures", "exp3_plot.png"),
#        dpi = 300,
#        height = 6,
#        width = 10)